In this notebook, I do some NLP, topic modeling, sentiment analysis and visualizations to help me better understand the overall dataset. I will apply the topic modeling techniques only to 5-star rated reviews because I want to know what consumers are liking/loving. This also helps computationally given the limited capabilities of my laptop. In addition, none of the questions in this take-home assignment were about a brand's poor performance or how to improve things. Therefore, this serves as another reason why I have excluded reviews that are rated below 5-stars.
Note: I personally have not worked with topic modeling outside of a classroom assignment context. Therefore the application of topic modeling to real world data such as this is new to me. I had to do a bit of research to find ways of implementing topic modeling and making sense of it. Therefore, several codeblocks in this notebook have been adapted from external sources which I will link in a resources document in the current directory.
import logging
import re
import sys
import warnings
from pprint import pprint

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.gensim
import scipy as sp
import spacy
from bs4 import BeautifulSoup
from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from textblob import TextBlob, Word

%matplotlib inline
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
np.random.seed(42)
# Load the raw reviews export; later cells read the 'title', 'review' and
# 'rating' columns from this frame.
reviews = pd.read_csv('../data/reviews.csv')
I make the assumption that review sentiment should be positively correlated with star rating. To test this assumption, I compute a sentiment polarity score for every review and compare the score distributions across star ratings.
# Merge the title and review body into a single text field.
# NOTE: the original used ''.join, which fused the title's last word to the
# review's first word (e.g. "Great productI loved it"); join with a space so
# downstream tokenization sees separate words.
reviews['review_text'] = [
    ' '.join(pair) for pair in zip(reviews['title'].map(str), reviews['review'])
]
# The merged column supersedes the two source columns.
reviews.drop(columns=['title', 'review'], inplace=True)
def detect_sentiment(review):
    """Return the TextBlob sentiment polarity of *review*.

    Polarity is a float in [-1.0, 1.0]; negative values indicate negative
    sentiment. *review* must already be a unicode str (the old .decode()
    workaround for bytes input has been removed as dead code).
    """
    return TextBlob(review).sentiment.polarity
# create a new DataFrame column for sentiment
# (WARNING: SLOW! — TextBlob parses every review one at a time)
reviews['sentiment'] = reviews['review_text'].apply(detect_sentiment)
# box plot of sentiment grouped by star ratings (WARNING: 0 RATING MEANS THERE WAS NO RATING FOR THIS REVIEW)
reviews.boxplot(column='sentiment', by='rating');
As expected, the sentiment score increases with star rating.
# def posts_to_words(review_text):
# # Function to convert a raw review to a string of words
# # The input is a single string (a customer review), and
# # the output is a single string (a preprocessed customer review)
# # remove non-letters.
# letters_only = re.sub("[^a-zA-Z]", " ", review_text)
# # convert to lower case, split into individual words.
# words = letters_only.lower().split()
# # define stop words
# stop_words = stopwords.words('english')
# # remove stop words.
# meaningful_words = [w for w in words if not w in stop_words]
# # # apply stemming to words to bring them to their root
# # p_stemmer = PorterStemmer()
# # # stem tokens
# # stem_spam = [p_stemmer.stem.stem(i) for i in meaningful_words]
# lemmatizer = WordNetLemmatizer()
# lemmed_words = [lemmatizer.lemmatize(i) for i in meaningful_words]
# return(" ".join(lemmed_words))
# def clean_posts(data):
# print("Cleaning and parsing posts...")
# j = 0
# for post in data:
# # Convert review to words, then append to clean_train_reviews.
# clean.append(posts_to_words(post))
# # If the index is divisible by 1000, print a message
# if (j + 1) % 1000 == 0:
# print(f'Review {j + 1} of {total_reviews}.')
# j += 1
# return clean
# # Get the number of posts based on the depanx dataframe size.
# total_reviews = reviews.shape[0]
# print(f'There are {total_reviews} reviews.')
# # Initialize an empty list to hold the clean posts.
# clean = []
# #clean_test_posts = []
#clean_reviews = clean_posts(reviews['review_text'])
# reviews['review'] = clean_reviews
# # create a document-term matrix using TF-IDF
# vect = TfidfVectorizer(stop_words=stop_words, max_features = 60000, min_df = 10)
# dtm = vect.fit_transform(reviews.review)
# features = vect.get_feature_names()
# dtm.shape
# Keep only the 5-star reviews for topic modeling. .copy() detaches the slice
# from `reviews`, so adding columns to reviews_5 later does not hit pandas'
# SettingWithCopyWarning / chained-assignment ambiguity.
reviews_5 = reviews[reviews['rating'] == 5].copy()
# Shape of the full dataset (for comparison with reviews_5 below).
reviews.shape
gensim allows us to automatically detect common phrases, i.e. multi-word expressions. Oftentimes n-grams give more contextual information than single words.
The following several blocks of code take several hours to run
# Build the bigram and trigram models  # WARNING: TOOK ABOUT 30 MINS TO RUN
# Phrases expects an iterable of token lists. The original passed raw strings
# (which gensim iterates character-by-character) and referenced the unimported
# name `gensim`; tokenize first so the phrase models match the tokenized docs
# they are later applied to in process_words().
tokenized_reviews = [simple_preprocess(str(doc)) for doc in reviews_5['review_text']]
# higher threshold -> fewer phrases
bigram = models.Phrases(tokenized_reviews, min_count=5, threshold=100)
trigram = models.Phrases(bigram[tokenized_reviews], threshold=100)
# Phraser freezes the heavy Phrases state into a fast, read-only applicator.
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)
from nltk.corpus import stopwords

# Base English stopword list from NLTK.
stop_words = stopwords.words('english')
# Domain/noise terms to drop as well. The original cell extended the list with
# several repeated copies of these words plus an inline copy of NLTK's own
# stopword list; both were redundant (the final set() already dedupes), so each
# extra term is listed exactly once here.
stop_words.extend([
    'subject', 'edu', 'use', 'would', 'say', 'could', '_', 'know', 'good',
    'go', 'get', 'done', 'try', 'many', 'nice', 'thank', 'think', 'see',
    'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run',
    'need', 'even', 'right', 'line', 'also', 'may', 'take', 'come', 'im',
    'ive', 'dont', 'hes', 'got', 'wa', 'ha', 'amazing', 'love', 'like',
    'wonderful', 'great', 'really', 'obsessed', 'ever', 'every', 'never',
    'awesome', 'super', 'makes', 'feels', 'absolutely', 'especially',
    'honestly', 'specifically', 'generally', 'definitely',
])
# Deduplicate; downstream code only needs `in` membership tests on a list.
stop_words = list(set(stop_words))
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Tokenize, remove stopwords, form bigrams/trigrams, lemmatize and keep
    only tokens whose coarse POS tag is in *allowed_postags*.

    Parameters
    ----------
    texts : iterable of str
        Raw documents.
    stop_words : collection of str
        Tokens to drop, both before and after lemmatization.
    allowed_postags : list of str
        spaCy coarse POS tags to keep.

    Returns
    -------
    list[list[str]] : one processed token list per input document.
    """
    # Tokenize and drop stopwords.
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
             for doc in texts]
    # Apply each phrase model once. (The original applied bigram_mod on its
    # own and then again inside the trigram pass — a redundant double pass.)
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    # The 'en' shortcut link was removed in spaCy 3; load the small English
    # pipeline by its full name. Parser/NER are disabled because only the
    # tagger and lemmatizer are needed here.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # Remove stopwords once more, since lemmatization can map inflected forms
    # back onto stopwords.
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
                 for doc in texts_out]
    return texts_out
# Preprocess every 5-star review (slow — tokenization, phrase models and
# spaCy lemmatization over the whole subset).
data_ready = process_words(reviews_5['review_text'])
# Create Dictionary: maps each unique token to an integer id
id2word = corpora.Dictionary(data_ready)
# Create Corpus: term document frequency — one (token_id, count) bag per doc
corpus = [id2word.doc2bow(text) for text in data_ready]
# Build LDA model. Only `from gensim import models` is in scope at the top of
# the notebook, so use that name (the original referenced the unimported
# top-level `gensim` and raised NameError).
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,       # fixed seed -> reproducible topics
    update_every=1,
    chunksize=20,
    passes=20,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,   # downstream cells index row_list[0]; keep True
)
lda_model.print_topics()
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data_ready):
    """Return one row per document: dominant topic id, that topic's
    contribution to the document, the topic's top keywords, and the text.

    Parameters
    ----------
    ldamodel : gensim LdaModel
    corpus : list of bag-of-words documents (defaults to the notebook corpus)
    texts : processed token lists appended as the last column
    """
    rows = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics=True the model yields a tuple whose first
        # element is the (topic_id, probability) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda item: item[1], reverse=True)
        if not row:
            # No topic assigned to this document (matches the original, which
            # simply never appended a row in this case).
            continue
        topic_num, prop_topic = row[0]  # dominant topic
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))
    # Build the frame once from a list of rows: DataFrame.append-in-a-loop was
    # O(n^2) and was removed entirely in pandas 2.0.
    sent_topics_df = pd.DataFrame(
        rows, columns=['dominant_topic', 'perc_contribution', 'topic_keywords']
    )
    # Add original (processed) text as the last column.
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
# Tag every 5-star review with its dominant topic; rows stay in the original
# document order.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
# Format: move the positional index into an explicit 'review_num' column.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['review_num', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text']
df_dominant_topic.head(4)
# Show more characters per column when displaying frames.
pd.options.display.max_colwidth = 100

# Within each topic, rank the documents by how strongly that topic dominates
# them, then stack the per-topic groups into one frame (sorted by topic id,
# then by contribution descending).
sorted_groups = [
    group.sort_values(['perc_contribution'], ascending=False)
    for _, group in df_topic_sents_keywords.groupby('dominant_topic')
]
sent_topics_sorteddf_mallet = pd.concat(sorted_groups, axis=0)

# Fresh positional index for the re-ordered rows.
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Rename columns to reflect the new role of each field.
sent_topics_sorteddf_mallet.columns = ['topic_num', "topic_perc_contrib", "keywords", "representative_text"]
# Preview the most representative documents.
sent_topics_sorteddf_mallet.head()
print(reviews_5.shape)
print(sent_topics_sorteddf_mallet.shape)
# create an index column in sent_topics_sorteddf_mallet dataframe that will match the index of reviews_5 dataframe (I will use this column to join the two dataframes on).
# NOTE(review): sent_topics_sorteddf_mallet was re-sorted by topic and
# contribution and its original index was dropped, so assigning
# reviews_5.index positionally here does NOT restore the review-to-topic
# pairing — these rows end up misaligned. Verify this merge; the
# df_dominant_topic join (which preserves document order) is the safe one.
sent_topics_sorteddf_mallet['index'] = reviews_5.index
# create an index column matching the index of this dataframe
reviews_5['index'] = reviews_5.index
# df_dominant_topic rows are still in original document order, so this
# positional assignment correctly lines review i up with its dominant topic.
df_dominant_topic['index'] = reviews_5.index
df_dominant_topic_2 = df_dominant_topic[['index', 'text']]
df_dominant_topic_2.head()
Next I merge reviews_5 with the dataframe that contains data about the topics (sent_topics_sorteddf_mallet).
# Join the 5-star reviews with their topic assignments on the shared 'index'
# column, then pull in the document-ordered dominant-topic frame as well.
data_with_topics = pd.merge(reviews_5, sent_topics_sorteddf_mallet, on='index')
data_with_topics.shape
data_with_topics = pd.merge(data_with_topics, df_dominant_topic, on='index')
# The processed-review column is named 'text' (lowercase, set when
# df_dominant_topic's columns were assigned); the original renamed 'Text',
# which silently matched no column and left the name unchanged.
data_with_topics.rename(columns={'text': 'parsed_review'}, inplace=True)
data_with_topics.shape
# Persist the enriched dataset for downstream notebooks.
data_with_topics.to_csv('../data/processed_df_topics.csv')
In the following section I won't comment on the visualizations, as they look pretty self-explanatory.
# Length of each processed review — df_dominant_topic.text holds the token
# lists produced by process_words, so this is a lemmatized-token count, not a
# raw word count.
doc_lens = [len(d) for d in df_dominant_topic.text]
# Plot
plt.figure(figsize=(8,5), dpi=160)
plt.hist(doc_lens, bins = 200, color='navy')
# Clip the x-axis to 0-250 tokens; longer outliers fall off the right edge.
plt.gca().set(xlim=(0, 250), ylabel='Number of Reviews', xlabel='Review Word Count')
plt.tick_params(size=16)
# Nine evenly spaced ticks across the clipped range.
plt.xticks(np.linspace(0,250,9))
plt.title('Distribution of Review Word Counts', fontdict=dict(size=22))
plt.show()
# 1. Wordcloud of top n words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'
# NOTE(review): color_func closes over the loop variable `i` from the plotting
# cell below (late binding). It works only because generate_from_frequencies()
# is called inside that loop, so `i` holds the current topic index at draw
# time — fragile, so confirm before reusing this object elsewhere.
cloud = WordCloud(stopwords=stop_words,
background_color='white',
width=2500,
height=1800,
max_words=20,
colormap='tab10',
color_func=lambda *args, **kwargs: cols[i],
prefer_horizontal=1.0)
topics = lda_model.show_topics(formatted=False)
# 3x2 grid = 6 axes, but the model has only 5 topics: hide the spare axis
# instead of indexing past the end of `topics` (the original raised
# IndexError on the sixth subplot).
fig, axes = plt.subplots(3, 2, figsize=(17,20), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    if i >= len(topics):
        ax.axis('off')
        continue
    fig.add_subplot(ax)
    # Top keywords for topic i, as {word: weight} for the cloud generator.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()
from collections import Counter

# Corpus-wide frequency of every processed token.
counter = Counter(token for doc in data_ready for token in doc)

topics = lda_model.show_topics(formatted=False)
# One record per (topic, keyword): word, topic id, LDA weight, corpus count.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# Plot Word Count and Weights of Topic Keywords
# NOTE(review): a 2x2 grid holds only 4 axes while the LDA model was built
# with num_topics=5, so topic 4 is silently never plotted — consider a 3x2
# grid with the spare axis hidden.
fig, axes = plt.subplots(2, 2, figsize=(10,10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
# Wide translucent bars: raw corpus counts (left y-axis).
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
# Narrow solid bars on a twin axis: LDA keyword weights (right y-axis).
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
# Fixed limits keep the four panels visually comparable.
ax_twin.set_ylim(0, 0.15); ax.set_ylim(0, 70000)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()
# Get topic weights and dominant topics
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Get topic weights. The model reports only topics above a probability
# threshold per document, so build each row keyed by topic id; the original
# kept the bare weight values, which shifted columns whenever a
# low-probability topic was omitted from a document's list.
num_topics_in_model = lda_model.num_topics
topic_weights = []
for row_list in lda_model[corpus]:
    # row_list[0] is the (topic_id, prob) list because per_word_topics=True.
    doc_topics = dict(row_list[0])
    topic_weights.append([doc_topics.get(t, 0.0) for t in range(num_topics_in_model)])

# Array of topic weights: one row per document, one column per topic.
arr = pd.DataFrame(topic_weights).fillna(0).values
# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]
# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)
# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)
# Plot the topic clusters using Bokeh
output_notebook()
n_topics = 5
# One Tableau colour per topic; indexing the array with topic_num colours
# each point by its document's dominant topic.
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)
# pyLDAvis was already imported and enabled at the top of the notebook;
# repeating it here is harmless and keeps this cell runnable on its own.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Interactive inter-topic distance map with per-topic term bar charts.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
Topics 2 and 4 are overlapping. Further tuning of the model is necessary to improve topic separation.